This document includes the analysis of simple and complex satellite abundances in different Black/6 and Black/10 mouse strains, and uses the written history (generations at splits along the “phylogeny”) to calculate rates of copy number change.
First, load in and process the data
data <- read.table("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_MA_30x_trimmed_data_kseek.rep.normGC", header=T)
#data[1:5, 1:5]
nrow(data)
## [1] 13749
#colnames(data)
#need to flip the data frame since it's in the opposite orientation
data2 <- data.frame(t(data[-1]))
# match the file names to the names of the lines
#data2[, 1:5]
colnames(data2) <- data[,1]
line_names <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_line_names.csv", header=F)
line_files <- as.vector(line_names[,1])
line_names <- as.vector(line_names[,2])
names(line_names) <- line_files
line_names_df <- as.data.frame(line_names)
data_withnames <- merge (data2, line_names, by=0)
lines <- as.vector(data_withnames[,"y"])
data_withnames[,"y"] <- NULL
data_withnames[,1] <- NULL
rownames (data_withnames) <- lines
#data_withnames[,1:3]
#data2[,1:5]
# simplify the kmer names
kmer.labels <- sapply(strsplit(colnames(data_withnames), "\\/"), '[', 1)
colnames(data_withnames) <- kmer.labels
# now calculate the means and sort the data
kmer_means <- colMeans(data_withnames)
data_sorted <- data_withnames[,order(kmer_means, decreasing=T)]
Next, select the kmers to focus on based on abundance, and plot PCAs
# first use common kmers, which have an average abundance of at least 100
get_common_kmers <- function (kmer_matrix) {
common.kmer.indexes <- c()
for (i in 1:ncol(kmer_matrix)) {
if (mean(kmer_matrix[,i])>=100) {
common.kmer.indexes <- c(common.kmer.indexes, i)
}
}
names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
return (common.kmer.indexes)
}
data_sorted_common <- data_sorted[,get_common_kmers(data_sorted)]
#data_sorted_common[,1:5]
ncol(data_sorted_common)
## [1] 63
AG_repeats <- c("AG", "AAG", "AAAG", "AGG", "AAGGAG", "AAGG", "AAAGG", "AAGAG", "AAGGG", "AGAGG", "AAAAAG", "AAAGAG", "AAAGGG", "AAAAAG", "AGAGGG", "AAGAGG", "AAAGGAAG", "AGGG")
data_sorted_common[,AG_repeats]
## AG AAG AAAG AGG AAGGAG AAGG
## BL6/NTac 67256.45 26347.52 7296.452 8533.097 2418.486 2198.468
## BL6/NCrl 69750.00 38055.28 10579.327 8329.694 2560.842 2322.308
## BL6/NHsd 48925.53 16230.10 5543.591 7846.022 1587.977 1526.284
## BL6N-TyrCBrdCrlCrl 70561.23 38934.90 10379.190 9185.140 2827.708 2397.239
## BL6/JBomTac 66683.30 32800.02 9477.913 8461.805 2386.488 2309.149
## BL6/J 60032.36 27832.02 7200.989 9617.130 2744.191 2227.678
## BL10/ScSnJ 64246.86 40959.64 8851.526 10087.380 2986.484 2297.634
## BL10/SnJ 62270.60 42314.92 10284.460 9944.040 2981.819 2302.539
## BL10/ScCr 50443.03 17614.41 5509.765 10019.725 2079.948 1839.200
## BL6/NJ 63688.51 39818.80 9769.525 9866.128 2859.624 2327.099
## BL10/J 66349.53 41738.66 10232.616 8473.754 2656.981 2079.099
## BL6/ByJ 65096.19 36217.82 10223.577 8199.549 2427.937 2149.029
## BL6/JEiJ 62512.38 33133.06 9364.256 8260.159 2339.655 2148.763
## BL10/ScNHsd 63805.98 38974.22 10193.485 7739.205 2386.076 2139.598
## AAAGG AAGAG AAGGG AGAGG AAAAAG
## BL6/NTac 1736.9611 1042.5865 999.5002 535.6924 245.2341
## BL6/NCrl 1881.9835 1211.2068 960.0620 565.0340 219.2628
## BL6/NHsd 875.7659 624.7763 835.4631 449.1689 244.1678
## BL6N-TyrCBrdCrlCrl 1950.5955 1203.5164 1104.6945 629.0574 237.9968
## BL6/JBomTac 1835.7438 1230.6188 944.1000 588.7770 226.2638
## BL6/J 1656.7825 1041.2083 974.7886 573.7315 247.8997
## BL10/ScSnJ 1869.7141 1199.3150 1018.3826 583.8824 229.5712
## BL10/SnJ 1925.2236 1227.2337 983.7501 579.4867 246.9304
## BL10/ScCr 992.8838 685.8580 1007.0211 540.3200 268.6505
## BL6/NJ 1897.0839 1204.7733 1020.5747 644.0331 228.0036
## BL10/J 1762.1477 1211.4311 799.3172 492.8769 271.2413
## BL6/ByJ 1781.3128 1106.8807 881.1411 512.5859 248.5310
## BL6/JEiJ 1737.5669 1089.5450 843.0328 528.1448 258.9078
## BL10/ScNHsd 1792.5522 1162.6397 830.4409 536.9965 216.8835
## AAAGAG AAAGGG AAAAAG.1 AGAGGG AAGAGG AAAGGAAG
## BL6/NTac 263.2982 251.1064 245.2341 223.9919 139.93000 122.36293
## BL6/NCrl 360.0023 253.2182 219.2628 198.2796 154.23436 152.96323
## BL6/NHsd 157.4740 191.8332 244.1678 184.2827 59.50754 60.94818
## BL6N-TyrCBrdCrlCrl 381.7134 297.4409 237.9968 235.0870 170.15005 139.63608
## BL6/JBomTac 359.0054 247.6448 226.2638 221.1700 132.16278 142.68280
## BL6/J 278.0870 294.7840 247.8997 187.6473 128.91459 114.64040
## BL10/ScSnJ 369.5459 297.4225 229.5712 191.3634 136.97390 135.56400
## BL10/SnJ 417.5889 312.8885 246.9304 175.6377 138.50373 138.72122
## BL10/ScCr 164.5873 247.0828 268.6505 206.0989 71.88263 60.08225
## BL6/NJ 417.2335 305.4141 228.0036 210.1792 151.32141 136.65068
## BL10/J 399.0728 264.1721 271.2413 208.6264 127.90973 144.12123
## BL6/ByJ 354.3842 240.8161 248.5310 195.0101 132.82618 143.98770
## BL6/JEiJ 320.7037 235.4233 258.9078 201.1888 118.99387 129.50019
## BL10/ScNHsd 338.5952 212.5425 216.8835 216.0247 124.51339 147.25015
## AGGG
## BL6/NTac 142.7499
## BL6/NCrl 123.2662
## BL6/NHsd 119.5792
## BL6N-TyrCBrdCrlCrl 146.3483
## BL6/JBomTac 133.7977
## BL6/J 122.2764
## BL10/ScSnJ 135.3885
## BL10/SnJ 125.6165
## BL10/ScCr 151.9664
## BL6/NJ 133.4303
## BL10/J 117.8604
## BL6/ByJ 110.3731
## BL6/JEiJ 132.9163
## BL10/ScNHsd 117.4796
sum(kmer_means[AG_repeats])
## [1] 125050.6
AC_repeats <- c("AC", "ACC", "AACC", "AAAAAC", "AAAACC", "AAAAACC", "AAACC")
sum(kmer_means[AC_repeats])
## [1] 192392.3
which(colnames(data_sorted_common)=="AGGG")
## [1] 51
# now make a PCA using these 63 kmers
pca.data <- prcomp(data_sorted_common, scale.=T)
library(ggbiplot)
## Loading required package: ggplot2
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2020a.
## 1.0/zoneinfo/America/New_York'
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8,choices=c(1,3), labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# do a PCA using more data - see if there is separation.
get_legit_kmer_indexes <- function (kmer_mx) {
indexes <- c()
for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
p <- kmer_mx[,i] >= 10 ## at least 10 copies
if (sum(p) >= 1) { # if one sample have at least 2 copies
indexes <- c(indexes, i)
}
}
return(indexes)
}
legit_indexes <- get_legit_kmer_indexes(data_sorted)
length(legit_indexes)
## [1] 434
# 434 kmers have at least 10 copies in at least one sample - use these now for the PCA.
legit_data <- data_sorted[,legit_indexes]
# change the name
# colour them differently
rownames(legit_data)[4] <- "BL6/NTyrCBrdCrlCrl"
strain <- sapply(strsplit(as.vector(rownames(legit_data)), "\\/"), '[', 1)
pca.data <- prcomp(legit_data, scale.=T)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# here's some separation
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = strain, var.axes = F, alpha=0.8, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, choices=c(1,3), labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
Get the amount of satellites in base pairs in each line
get_abskmer_mx <- function (kmer_mx, indexes) {
kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
for (i in indexes) {
for (j in 1:nrow(kmer_mx)) {
kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars"))
}
}
kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)]# get rid of columns that are all NAs
rownames(kmers.abs) <- rownames(kmer_mx)
colnames(kmers.abs) <- colnames(kmer_mx)
return(kmers.abs)
}
# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
total.kmers.abs <- c()
for (i in 1:nrow(abs_mx)){
total.kmers.abs[i] <- sum(abs_mx[i,])/10^6
}
names(total.kmers.abs) <- rownames(abs_mx)
return (total.kmers.abs)
}
data_abs <- get_abskmer_mx(legit_data, c(1:ncol(legit_data)))
total_kmer_content <- get_total_kmercontent(data_abs)
total_kmer_content # note, this is in MB
## BL6/NTac BL6/NCrl BL6/NHsd
## 1.222917 1.288536 1.073765
## BL6/NTyrCBrdCrlCrl BL6/JBomTac BL6/J
## 1.393987 1.270839 1.388050
## BL10/ScSnJ BL10/SnJ BL10/ScCr
## 1.472052 1.469211 1.179894
## BL6/NJ BL10/J BL6/ByJ
## 1.469178 1.274572 1.285923
## BL6/JEiJ BL10/ScNHsd
## 1.215779 1.250947
boxplot(total_kmer_content, las=2)
Bring in the tree and visualize how kmer abundance evolves along the tree
library(phytools)
## Loading required package: ape
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
mouse_tree <- read.tree("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/manual_ext.tre")
plot(mouse_tree)
# need to rename so they match the tree
new_rownames <- gsub("/", "", names(total_kmer_content))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
names(total_kmer_content) <- new_rownames2
#total_kmer_content
obj<-contMap(mouse_tree,total_kmer_content,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
Get kmer lengths and GC content
Calc_kmer_length <- function (kmer_string) {
return (nchar(kmer_string))
}
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
##
## rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
##
## desc
## Loading required package: XVector
## Warning: package 'XVector' was built under R version 3.3.3
##
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
Calc_GC_content <- function (kmer_string) {
string <- BString(kmer_string)
gc.content <- c(letterFrequency(string, letters=c("CG"), as.prob=F)/(length(string)))
return(gc.content)
}
common.kmers <- colnames(data_sorted_common)
common.kmer.lengths <- sapply(common.kmers, Calc_kmer_length)
names(common.kmer.lengths) <- common.kmers
#common.kmer.lengths[1:10]
common.kmer.gc <- sapply(common.kmers, Calc_GC_content)
names(common.kmer.gc) <- common.kmers
#common.kmer.gc[1:10]
hist(common.kmer.lengths, breaks=20)
summary(common.kmer.lengths) # median = 6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 6.000 6.524 6.000 19.000
length(which(common.kmer.lengths==4)) # 11
## [1] 11
length(which(common.kmer.lengths==5)) #8
## [1] 8
length(which(common.kmer.lengths==6)) # 22
## [1] 22
length(common.kmer.lengths)
## [1] 63
hist(common.kmer.gc, breaks=10)
summary(common.kmer.gc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3333 0.5000 0.4456 0.5895 0.7500
# look at legit kmers now
legit.kmers <- colnames(legit_data)
legit.kmer.lengths <- sapply(legit.kmers, Calc_kmer_length)
names(legit.kmer.lengths) <- legit.kmers
legit.kmer.gc <- sapply(legit.kmers, Calc_GC_content)
names(legit.kmer.gc) <- legit.kmers
hist(legit.kmer.lengths, breaks=20)
hist(legit.kmer.gc, breaks=10)
# note: telomere satellite is common.kmers[4]:
common.kmers[4]
## [1] "AACCCT"
total_kmer_content
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 1.222917 1.288536 1.073765 1.393987
## B6JBomTac B6J B10ScSnJ B10SnJ
## 1.270839 1.388050 1.472052 1.469211
## B10ScCr B6NJ B10J B6ByJ
## 1.179894 1.469178 1.274572 1.285923
## B6JEiJ B10ScNHsd
## 1.215779 1.250947
Boxplot of abundant kmers
abun_data <- c()
for (i in 1:ncol(data_sorted_common)) {
abun_data <- c(abun_data, data_sorted_common[,i])
}
kmer_names <- c()
for (i in 1:ncol(data_sorted_common)) {
kmer_names <- c(kmer_names, c(rep(colnames(data_sorted_common)[i], times = nrow(data_sorted_common))))
}
rownames(data_sorted_common)[4] <- "BL6/NTyrCBrdCrlCrl"
abun_df <- data.frame(rownames(data_sorted_common), kmer_names, abun_data)
abun_df[1:15, ]
## rownames.data_sorted_common. kmer_names abun_data
## 1 BL6/NTac AC 185511.68
## 2 BL6/NCrl AC 182274.28
## 3 BL6/NHsd AC 182867.67
## 4 BL6/NTyrCBrdCrlCrl AC 192672.86
## 5 BL6/JBomTac AC 188531.48
## 6 BL6/J AC 202091.49
## 7 BL10/ScSnJ AC 202385.62
## 8 BL10/SnJ AC 199890.74
## 9 BL10/ScCr AC 204310.46
## 10 BL6/NJ AC 197374.94
## 11 BL10/J AC 175373.72
## 12 BL6/ByJ AC 178864.78
## 13 BL6/JEiJ AC 180017.66
## 14 BL10/ScNHsd AC 182467.11
## 15 BL6/NTac AG 67256.45
colnames(abun_df) <- c("Sample", "kmer", "Copy")
strain <- sapply(strsplit(as.vector(abun_df$Sample), "\\/"), '[', 1)
abun_df <- data.frame(abun_df, strain)
#abun_df
library(ggplot2)
abun_df$kmer <- factor(abun_df$kmer, levels=colnames(data_sorted_common))
# this plot needs some work
ggplot(data = abun_df[c(1:70),], aes(x = kmer, y = Copy)) +
geom_boxplot() +
geom_jitter(aes(color=strain)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
ggplot(data = abun_df[c(71:140),], aes(x = kmer, y = Copy)) +
geom_boxplot() +
geom_jitter(aes(color=strain)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
Now, calculate mutation rates: Set it up with functions
# note, this part is needed to make the table I import in:
mouse_tree$tip.label
## [1] "B10SnJ" "B10ScSnJ" "B10ScCr"
## [4] "B10ScNHsd" "B10J" "B6ByJ"
## [7] "B6NCrl" "B6NHsd" "B6NTac"
## [10] "B6NTyrCBrdCrlCrl" "B6NJ" "B6JEiJ"
## [13] "B6JBomTac" "B6J"
mouse_tree$Nnode # 13 nodes now
## [1] 13
findMRCA(mouse_tree, tips=mouse_tree$tip.label, type="node")
## [1] 15
findMRCA(mouse_tree, tips=c("B6J", "B6ByJ"), type="node")
## [1] 20
findMRCA(mouse_tree, tips=c("B6J", "B6JEiJ"), type="node")
## [1] 26
findMRCA(mouse_tree, tips=c("B6J", "B6JBomTac"), type="node")
## [1] 27
findMRCA(mouse_tree, tips=c("B6NJ", "B6ByJ"), type="node")
## [1] 21
findMRCA(mouse_tree, tips=c("B6NJ", "B6NCrl"), type="node")
## [1] 22
findMRCA(mouse_tree, tips=c("B6NJ", "B6NHsd"), type="node")
## [1] 23
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTac"), type="node")
## [1] 24
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTyrCBrdCrlCrl"), type="node")
## [1] 25
findMRCA(mouse_tree, tips=c("B10J", "B10SnJ"), type="node")
## [1] 16
findMRCA(mouse_tree, tips=c("B10J", "B10ScSnJ"), type="node")
## [1] 17
findMRCA(mouse_tree, tips=c("B10J", "B10ScCr"), type="node")
## [1] 18
findMRCA(mouse_tree, tips=c("B10J", "B10ScNHsd"), type="node")
## [1] 19
# here is the node info file:
node_info <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/node_info_Mar17-20.csv", header = T)
# here is a function to calculate mutation rates for our tree, given an abundance matrix
calc_mutation_rates <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (df_abun[j,i] - anc_states$ace[subset_table$nearest_node]) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
# check for total kmer content
anc_states <- fastAnc(mouse_tree, total_kmer_content, vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22
## 1.314349 1.347041 1.338262 1.309295 1.282353 1.281658 1.275193 1.270799
## 23 24 25 26 27
## 1.261257 1.299648 1.371612 1.283420 1.305079
##
## Variances on ancestral states:
## 15 16 17 18 19 20 21 22
## 0.019104 0.005736 0.004946 0.004616 0.003865 0.005283 0.003684 0.003130
## 23 24 25 26 27
## 0.002635 0.002244 0.001809 0.003905 0.003184
##
## Lower & upper 95% CIs:
## lower upper
## 15 1.043446 1.585252
## 16 1.198596 1.495486
## 17 1.200424 1.476101
## 18 1.176135 1.442455
## 19 1.160505 1.404202
## 20 1.139196 1.424119
## 21 1.156236 1.394149
## 22 1.161153 1.380445
## 23 1.160639 1.361876
## 24 1.206795 1.392501
## 25 1.288246 1.454979
## 26 1.160935 1.405906
## 27 1.194482 1.415675
1.073765 - 1.264150 # lost by B6NHsd since common ancestor with below
## [1] -0.190385
1.469178 - 1.264150 # gained by B6NJ since common ancestor with below
## [1] 0.205028
# here is a function that calculates normalized-by-copy-number mutation rates, given the original abundance matrix and the non-normalized mutation matrix
calc_norm_mu <- function (df_abun, mu_mx) {
mu_mx_norm <- matrix(nrow=nrow(mu_mx), ncol=ncol(mu_mx))
for (i in 1:ncol(mu_mx)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T) # get the ancestral copy number
for (j in 1:nrow(mu_mx)) {
subset_table <- node_info[which(node_info$Sample==rownames(mu_mx)[j]),]
mu_mx_norm[j,i] <- mu_mx[j,i]/anc_states$ace[subset_table$nearest_node]
}
}
rownames(mu_mx_norm) <- rownames(mu_mx)
colnames(mu_mx_norm) <- colnames(mu_mx)
return(mu_mx_norm)
}
# this is a function that calculates absolute mutation rates (taking the absolute value of additions and subtractions).
calc_mutation_rates_abs <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
Now, do the calculations for the common kmers
new_rownames <- gsub("/", "", rownames(data_sorted_common))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
rownames(data_sorted_common) <- new_rownames2
# regular
mu_common <- calc_mutation_rates(data_sorted_common)
per_kmer_mu<- colMeans(mu_common)
length(per_kmer_mu)
## [1] 63
kmer_means_sorted <- sort (kmer_means, decreasing = T)
names(kmer_means_sorted)[1:63]
## [1] "AC" "AG" "AAG"
## [4] "AACCCT" "AGAT" "AGAGGC"
## [7] "AAAG" "AGG" "ACAGC"
## [10] "AAGGAG" "AAGG" "ACC"
## [13] "ATCC" "AAAGG" "ACAG"
## [16] "AAGTCTGCACACACATACT" "ACCT" "ACACAG"
## [19] "AT" "AAGAG" "AAGGG"
## [22] "AGAGAGGC" "ACATAT" "ACAGACAGCACAGCACAGC"
## [25] "AGC" "AGAGG" "AAAGACAGACAG"
## [28] "AAAAACAAGGGAGATAT" "AGGC" "ACAGACAGCACAGC"
## [31] "AACC" "AAAAG" "AAAGAG"
## [34] "AGATAT" "AGCAGG" "ACAGAT"
## [37] "AAAGGG" "AACAAG" "AAAAAG"
## [40] "AGGAGGC" "AAGC" "AGAGGG"
## [43] "AGAGAGGCAGAGGC" "AAAGC" "ACAGGC"
## [46] "AAACAAGGGAGATAT" "ACAGAG" "ACAT"
## [49] "ACT" "AGAGAT" "AGGG"
## [52] "AAAAAC" "AAGAGG" "AAAGGAAG"
## [55] "AAAAATCTTAAAGG" "ACTGCT" "ACACAT"
## [58] "AAACC" "AAAACC" "AAAAACC"
## [61] "ACACATAT" "AGCAGGAGGAGG" "ACATAG"
plot(x = kmer_means_sorted[1:63], y = per_kmer_mu )
plot (x = kmer_means_sorted[7:63], y = per_kmer_mu[7:63])
# generally a positive correlation
# for each line, do a sign test.
binom_results <- c()
for (i in 1:nrow(mu_common)) {
num_neg <- length(which(mu_common[i, ] < 0))
total <- ncol(mu_common)
p <- binom.test(num_neg, total)
binom_results <- c(binom_results, p$p.value)
}
binom_results
## [1] 8.980472e-04 1.000000e+00 5.152397e-03 4.295655e-02 3.135031e-01
## [6] 5.152397e-03 2.227532e-03 3.760612e-05 8.013065e-01 3.367093e-04
## [11] 1.000000e+00 1.000000e+00 8.980472e-04 4.295655e-02
rownames(mu_common)[which(binom_results < 0.01)]
## [1] "B6NTac" "B6NHsd" "B6J" "B10ScSnJ" "B10SnJ" "B6NJ"
## [7] "B6JEiJ"
### now do for each line - plot the common kmers abundance and mutation rate
#install.packages("ggallin")
library("ggallin")
rownames(data_sorted_common[1,])
## [1] "B6NTac"
a <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[1,]), y = as.numeric(mu_common[1,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[1,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[2,])
## [1] "B6NCrl"
b <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[2,]), y = as.numeric(mu_common[2,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[2,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[3,])
## [1] "B6NHsd"
c <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[3,]), y = as.numeric(mu_common[3,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[3,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[4,])
## [1] "B6NTyrCBrdCrlCrl"
d <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[4,]), y = as.numeric(mu_common[4,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[4,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[5,])
## [1] "B6JBomTac"
e <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[5,]), y = as.numeric(mu_common[5,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[5,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[6,])
## [1] "B6J"
f <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[6,]), y = as.numeric(mu_common[6,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[6,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[7,])
## [1] "B10ScSnJ"
g <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[7,]), y = as.numeric(mu_common[7,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[7,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[8,])
## [1] "B10SnJ"
h <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[8,]), y = as.numeric(mu_common[8,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[8,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[9,])
## [1] "B10ScCr"
i <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[9,]), y = as.numeric(mu_common[9,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[9,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[10,])
## [1] "B6NJ"
j <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[10,]), y = as.numeric(mu_common[10,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[10,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[11,])
## [1] "B10J"
k <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[11,]), y = as.numeric(mu_common[11,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[11,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[12,])
## [1] "B6ByJ"
l <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[12,]), y = as.numeric(mu_common[12,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[12,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[13,])
## [1] "B6JEiJ"
m <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[13,]), y = as.numeric(mu_common[13,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[13,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[14,])
## [1] "B10ScNHsd"
n <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[14,]), y = as.numeric(mu_common[14,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = rownames(data_sorted_common[14,])) +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
grid.arrange(a,b,c,d,e,f,g,h,i,j,k,l,m,n, ncol=2)
#normalize by copy number
mu_common_norm <- calc_norm_mu(data_sorted_common, mu_common)
# per kmer average mutation rate
per_kmer_mu_norm_mean <- colMeans(mu_common_norm)
par(mar = c(2, 2, 2, 2))
plot(per_kmer_mu_norm_mean)
# one kmer has the most negative mutatation rate
which(per_kmer_mu_norm_mean==min(per_kmer_mu_norm_mean)) # normalized by copy number, most contracting
## AAAGACAGACAG
## 27
# this is the one with phylogenetic signal
# verified this one manually to make sure there was no mistake.
highest_kmer <- data_sorted_common[,27]
names(highest_kmer) <- rownames(data_sorted_common)
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, highest_kmer, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21
## 420.27294 72.76111 57.51539 43.91212 21.99731 767.78476 788.11806
## 22 23 24 25 26 27
## 805.51032 851.40148 864.22009 872.23266 824.66497 846.29339
obj<-contMap(mouse_tree,highest_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
par(mar = c(2, 2, 2, 2))
per_line_mu_norm_mean <- rowMeans(mu_common_norm)
plot(per_line_mu_norm_mean) # line 10 is most increasing, line 3 is most decreasing
per_line_mu_norm_mean
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## -5.108446e-04 1.018382e-04 -9.047817e-04 2.279445e-04
## B6JBomTac B6J B10ScSnJ B10SnJ
## -2.278171e-04 4.288457e-04 1.220611e-04 1.862966e-04
## B10ScCr B6NJ B10J B6ByJ
## -3.973537e-04 8.150201e-04 2.706781e-05 -3.471784e-05
## B6JEiJ B10ScNHsd
## -1.783642e-04 -3.200731e-04
# look at the telomere satellite
telomere_kmer <- data_sorted_common[,4]
names(telomere_kmer) <- rownames(data_sorted_common)
telomere_kmer
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 30146.60 24934.31 26172.35 31314.12
## B6JBomTac B6J B10ScSnJ B10SnJ
## 25013.94 45939.98 46029.90 40211.11
## B10ScCr B6NJ B10J B6ByJ
## 35171.94 39748.29 22781.08 30818.52
## B6JEiJ B10ScNHsd
## 24649.66 26089.82
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, telomere_kmer, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21 22
## 33122.94 35614.98 35362.01 33648.22 28314.51 30630.91 30000.19 29604.91
## 23 24 25 26 27
## 30121.95 31568.94 33730.17 30980.68 33096.57
obj<-contMap(mouse_tree,telomere_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# range 22781 - 46030 - factor of 2.
# look at CAG repeats (polyQ)
which(names((data_sorted_common))=="AGC")
## [1] 25
poly_Q <- data_sorted_common[,25]
names(poly_Q) <- rownames(data_sorted_common)
anc_states <- fastAnc(mouse_tree, poly_Q, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21 22
## 587.9395 588.3390 589.6813 588.8991 584.2025 587.5399 584.7799 584.6911
## 23 24 25 26 27
## 585.0344 585.1220 585.3344 591.7505 587.1016
obj<-contMap(mouse_tree,poly_Q,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
poly_Q # very tight: 572-613 copies
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 584.8186 582.6070 585.9047 588.1745
## B6JBomTac B6J B10ScSnJ B10SnJ
## 572.0243 591.7187 609.6128 573.4039
## B10ScCr B6NJ B10J B6ByJ
## 595.1478 582.8482 577.5038 577.1086
## B6JEiJ B10ScNHsd
## 613.4282 584.0698
## Let's make a box plot of mutation rates ###
head(mu_common_norm)
## AC AG AAG AACCCT
## B6NTac -1.665755e-04 0.0004075430 -0.0016936235 -0.0004374261
## B6NCrl -1.037055e-04 0.0005307961 0.0010510781 -0.0008913244
## B6NHsd -1.520792e-04 -0.0015642071 -0.0033459144 -0.0009501471
## B6NTyrCBrdCrlCrl 4.070876e-05 0.0010917158 0.0012808578 -0.0011192028
## B6JBomTac -1.399663e-04 0.0004652882 0.0004670982 -0.0021052905
## B6J 4.700042e-04 -0.0004409409 -0.0009093677 0.0033453313
## AGAT AGAGGC AAAG AGG
## B6NTac -0.0036641401 -3.605975e-04 -0.0015142443 -3.295951e-04
## B6NCrl 0.0015920712 -3.000287e-04 0.0010430853 -1.410237e-04
## B6NHsd -0.0043442036 2.057965e-06 -0.0024647120 -6.330540e-04
## B6NTyrCBrdCrlCrl 0.0009625905 2.326053e-04 0.0015827283 -4.349942e-05
## B6JBomTac 0.0017003030 -2.519822e-05 0.0008803666 -3.848845e-04
## B6J -0.0017989410 2.586653e-04 -0.0014021171 7.395840e-04
## ACAGC AAGGAG AAGG ACC
## B6NTac -6.017285e-05 -0.0002962075 7.938251e-05 -0.0006369315
## B6NCrl -6.596555e-04 0.0003701609 4.853483e-04 -0.0001221603
## B6NHsd 3.251943e-04 -0.0023512420 -1.946080e-03 -0.0002215841
## B6NTyrCBrdCrlCrl 1.249040e-03 0.0008304876 8.059319e-04 -0.0005678451
## B6JBomTac -2.921477e-04 -0.0004308801 2.956358e-04 -0.0005847944
## B6J 8.914476e-04 0.0007966639 -1.894608e-05 0.0008934752
## ATCC AAAGG ACAG
## B6NTac -4.760371e-04 0.0001649562 -9.431094e-05
## B6NCrl 2.326906e-05 0.0007292390 -8.334183e-05
## B6NHsd -9.738023e-04 -0.0032640963 -1.418911e-03
## B6NTyrCBrdCrlCrl -2.380564e-04 0.0010688596 2.446654e-03
## B6JBomTac -2.671971e-04 0.0004943666 3.432759e-04
## B6J 5.718671e-04 -0.0003942336 6.148712e-05
## AAGTCTGCACACACATACT ACCT ACACAG
## B6NTac -0.0003693063 -0.0005101438 -0.0002956678
## B6NCrl -0.0007367637 0.0006628411 -0.0001270817
## B6NHsd 0.0003864204 -0.0042674535 -0.0001753629
## B6NTyrCBrdCrlCrl -0.0006380546 0.0026445685 0.0004140957
## B6JBomTac -0.0012574995 0.0006555566 -0.0003819965
## B6J 0.0017008338 -0.0005348536 0.0006864821
## AT AAGAG AAGGG AGAGAGGC
## B6NTac -0.002822038 -0.0002509086 8.403835e-05 -4.090651e-04
## B6NCrl 0.001638562 0.0007967816 9.550605e-05 -9.735448e-05
## B6NHsd -0.004908572 -0.0027839155 -8.956826e-04 -9.252141e-05
## B6NTyrCBrdCrlCrl -0.001023197 0.0008232176 1.132484e-03 8.045671e-04
## B6JBomTac 0.001204549 0.0008527776 5.331611e-05 -2.558212e-05
## B6J -0.000407570 -0.0006053300 3.352709e-04 1.654990e-04
## ACATAT ACAGACAGCACAGCACAGC AGC
## B6NTac -8.973783e-04 -0.0001011831 -5.035019e-06
## B6NCrl 3.277600e-04 -0.0006282529 -2.013794e-05
## B6NHsd -8.395672e-04 0.0005911077 1.077951e-05
## B6NTyrCBrdCrlCrl 6.342445e-04 0.0012820218 7.581416e-05
## B6JBomTac -7.849088e-05 -0.0004538098 -2.213871e-04
## B6J 2.805531e-04 0.0008161647 6.779565e-05
## AGAGG AAAGACAGACAG AAAAACAAGGGAGATAT
## B6NTac -5.451441e-04 0.0001919794 -0.0006024016
## B6NCrl 1.976184e-04 -0.0007518187 -0.0007702676
## B6NHsd -1.271293e-03 0.0009523442 0.0027391458
## B6NTyrCBrdCrlCrl 6.162522e-04 -0.0005997973 -0.0013208665
## B6JBomTac 3.186189e-04 -0.0010862144 -0.0010467956
## B6J 9.018506e-05 0.0015819253 0.0009937361
## AGGC ACAGACAGCACAGC AACC AAAAG
## B6NTac -0.0020117145 4.751776e-05 -6.836915e-04 0.0001320768
## B6NCrl -0.0016632974 -7.066192e-04 2.378089e-04 0.0001860139
## B6NHsd 0.0004286222 6.277708e-04 7.886844e-05 0.0001341426
## B6NTyrCBrdCrlCrl -0.0024026780 1.163767e-03 3.846697e-04 0.0003208179
## B6JBomTac -0.0044165306 -4.766036e-04 2.781311e-04 0.0003026365
## B6J 0.0048059615 9.447221e-04 -3.292601e-04 -0.0004063741
## AAAGAG AGATAT AGCAGG ACAGAT
## B6NTac -0.0016948721 -2.063519e-03 -8.270165e-04 -5.103155e-04
## B6NCrl 0.0008036940 1.502680e-03 -4.141238e-04 2.764000e-04
## B6NHsd -0.0034238043 -1.268949e-03 5.412703e-05 -2.417208e-03
## B6NTyrCBrdCrlCrl 0.0008106058 -2.490464e-04 -5.730695e-04 9.071935e-05
## B6JBomTac 0.0010697903 -2.440358e-04 -9.435134e-04 2.287869e-04
## B6J -0.0011144041 6.836742e-05 1.548868e-03 1.104506e-04
## AAAGGG AACAAG AAAAAG AGGAGGC
## B6NTac -4.677551e-04 2.546155e-05 0.0002801716 -9.187078e-04
## B6NCrl 6.409462e-05 8.371351e-04 -0.0004327796 -5.548012e-04
## B6NHsd -1.681945e-03 -2.853162e-03 0.0001672457 1.284541e-05
## B6NTyrCBrdCrlCrl 7.203940e-04 2.164467e-04 0.0001700484 -1.205617e-03
## B6JBomTac -5.027060e-04 -6.728850e-04 -0.0005210630 -5.620871e-04
## B6J 1.042552e-03 8.818891e-07 0.0002534426 1.029203e-03
## AAGC AGAGGG AGAGAGGCAGAGGC AAAGC
## B6NTac -5.434884e-04 0.0004652802 -3.312943e-04 3.483304e-04
## B6NCrl 7.403019e-04 -0.0001436764 8.352441e-05 7.155242e-05
## B6NHsd -3.233108e-04 -0.0007647574 -1.441311e-04 8.826565e-05
## B6NTyrCBrdCrlCrl 2.236890e-04 0.0011788727 4.182530e-04 5.895693e-04
## B6JBomTac -6.453794e-05 0.0007462053 3.510603e-05 7.007548e-04
## B6J 2.297196e-04 -0.0006735322 2.077488e-04 -6.941516e-04
## ACAGGC AAACAAGGGAGATAT ACAGAG ACAT
## B6NTac -0.0005248930 -0.0008268581 0.0008632881 -0.0010080044
## B6NCrl -0.0008088567 -0.0003203606 -0.0001639874 -0.0001848299
## B6NHsd -0.0002515918 0.0009086674 -0.0005174375 -0.0003440389
## B6NTyrCBrdCrlCrl -0.0003289970 -0.0006655922 -0.0005338290 0.0014361438
## B6JBomTac -0.0005080720 -0.0006975355 -0.0002898178 -0.0005412730
## B6J 0.0012970047 0.0008500562 0.0006739503 0.0003718677
## ACT AGAGAT AGGG AAAAAC
## B6NTac -3.081124e-04 -0.0029479031 0.0005669797 0.0002932379
## B6NCrl -7.990339e-05 0.0013842437 -0.0001663411 -0.0012957452
## B6NHsd -3.130408e-03 -0.0038984548 -0.0005714235 0.0046463337
## B6NTyrCBrdCrlCrl -8.992007e-04 -0.0007953841 0.0009922715 -0.0014301993
## B6JBomTac 3.735892e-04 0.0001349109 0.0003589676 -0.0077436549
## B6J 2.365849e-04 -0.0005866912 -0.0004142656 0.0056165427
## AAGAGG AAAGGAAG AAAAATCTTAAAGG ACTGCT
## B6NTac 0.0001734283 -0.0001211827 6.059315e-05 -0.0005373543
## B6NCrl 0.0010380652 0.0011953256 2.555483e-04 -0.0008290947
## B6NHsd -0.0038070492 -0.0034883860 3.463674e-03 -0.0004780371
## B6NTyrCBrdCrlCrl 0.0020798018 0.0009448833 5.935478e-04 -0.0011862780
## B6JBomTac 0.0002366203 0.0009352782 2.402745e-03 -0.0007848794
## B6J 0.0000189325 -0.0009428198 -1.437855e-04 0.0004148049
## ACACAT AAACC AAAACC AAAAACC
## B6NTac -0.0003328363 4.595988e-05 -0.0008159407 0.0002409840
## B6NCrl 0.0003042913 1.862295e-04 -0.0006498141 -0.0003272137
## B6NHsd -0.0010053734 -3.836788e-04 0.0013755596 0.0021085351
## B6NTyrCBrdCrlCrl -0.0017003803 -3.084245e-04 -0.0009651283 -0.0009297406
## B6JBomTac -0.0006450958 2.966610e-04 -0.0007742960 -0.0015244409
## B6J 0.0012565483 -2.042011e-04 0.0012831101 0.0017776114
## ACACATAT AGCAGGAGGAGG ACATAG
## B6NTac -0.0016465201 -4.637637e-06 -9.896685e-04
## B6NCrl 0.0014689497 -5.021070e-04 9.122955e-04
## B6NHsd -0.0040666019 -2.520583e-05 -2.393602e-03
## B6NTyrCBrdCrlCrl 0.0010072023 3.323291e-04 3.911062e-04
## B6JBomTac 0.0009190992 -3.435794e-04 -6.417093e-05
## B6J -0.0009905018 4.401858e-04 8.034789e-04
# need to get it in format to make a box plot. Want to use mu_common_norm
mutation_rate_data <- c()
kmer_names <- c()
for (i in 1:nrow(mu_common_norm)) {
mutation_rate_data <- c(mutation_rate_data, as.numeric(mu_common_norm[i,]) )
kmer_names <- c(kmer_names, colnames(mu_common_norm))
}
mutation_rate_df <- data.frame(mutation_rate_data, kmer_names)
head(mutation_rate_df)
## mutation_rate_data kmer_names
## 1 -0.0001665755 AC
## 2 0.0004075430 AG
## 3 -0.0016936235 AAG
## 4 -0.0004374261 AACCCT
## 5 -0.0036641401 AGAT
## 6 -0.0003605975 AGAGGC
# need to mess with the dimensions
par(mar = c(7,5,2,2))
boxplot (mutation_rate_df[,1] ~ mutation_rate_df[,2], data=mutation_rate_df, las=2, cex.axis=0.5, ylab="Copies changed/gen/copy")
abline(h=0, col="red", lwd=2)
# a plot for the hypermutators - part I (later add in complex)
# add in all the 14 samples
subset_hypermutators <- mu_common_norm[which(rownames(mu_common_norm)=="B6NHsd" | rownames(mu_common_norm)=="B6NJ"),]
# make this into a dataframe that can be a boxplot.
abun_hypermut <- c()
for (j in 1:ncol(mu_common_norm)) {
for (i in 1:nrow(mu_common_norm)) {
abun_hypermut <- c(abun_hypermut, mu_common_norm[i,j])
}
}
line_names <- c(rep(rownames(mu_common_norm), times = ncol(mu_common_norm) ))
kmer_names <- c()
for (i in 1:ncol(mu_common_norm)) {
kmer_names <- c(kmer_names, c(rep(colnames(mu_common_norm)[i], times = nrow(mu_common_norm))))
}
#now make a dataframe
hypermutator_df <- data.frame(line_names, kmer_names, abun_hypermut)
#abun_df
#hypermutator_df$line_names <- factor(hypermutator_df$line_names, levels=c("B6NHsd","B6NJ"))
# this plot needs some work
ggplot(data = hypermutator_df, aes(x = line_names, y = abun_hypermut)) +
geom_boxplot() +
geom_jitter(aes(color=line_names)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
# now absolute mutation rates
mu_common_abs <- calc_mutation_rates_abs(data_sorted_common)
mu_common_abs_norm <- calc_norm_mu(data_sorted_common, mu_common_abs)
per_kmer_mu_abs_norm_mean <- colMeans(mu_common_abs_norm)
sort(per_kmer_mu_abs_norm_mean, decreasing=T)[1:10]
## AAAAATCTTAAAGG AAAAAC AAAGACAGACAG AGAT AGGC
## 0.002199034 0.002024336 0.001949065 0.001690533 0.001666607
## AT AGAGAT ACACATAT AACCCT AAAGAG
## 0.001665801 0.001490404 0.001242575 0.001194939 0.001164543
per_line_mu_abs_norm_mean <- rowMeans(mu_common_abs_norm)
plot(per_kmer_mu_abs_norm_mean) # huh, when we take the absolute value of changes, it doesn't stand out. Only when we take the directional value
which(per_kmer_mu_abs_norm_mean==max(per_kmer_mu_abs_norm_mean))
## AAAAATCTTAAAGG
## 55
range(per_kmer_mu_abs_norm_mean)
## [1] 7.796324e-05 2.199034e-03
sort (per_kmer_mu_abs_norm_mean)
## AGC AGAGAGGCAGAGGC AC
## 7.796324e-05 1.682046e-04 2.065710e-04
## AGAGAGGC AACC AGAGGC
## 2.146289e-04 2.208613e-04 2.348638e-04
## AAAAG AGCAGGAGGAGG ACACAG
## 2.540561e-04 2.579746e-04 2.894117e-04
## AAGC AAACC AAAGC
## 3.236761e-04 3.358232e-04 3.458564e-04
## ACATAT AAAAAG AAGGG
## 3.556181e-04 3.626719e-04 3.646905e-04
## AAGG AGG ACAGAG
## 3.823177e-04 3.945726e-04 4.020140e-04
## AGAGG AGAGGG AGGG
## 4.021618e-04 4.082530e-04 4.225192e-04
## ATCC ACC ACAT
## 4.298685e-04 4.387321e-04 4.458417e-04
## AG AAACAAGGGAGATAT ACAG
## 4.701614e-04 5.491447e-04 5.680881e-04
## ACAGGC AACAAG AAGGAG
## 6.252987e-04 6.364125e-04 6.367376e-04
## AAAGGG AAGTCTGCACACACATACT ACAGAT
## 6.376478e-04 6.483114e-04 6.495259e-04
## ACACAT ACTGCT AGGAGGC
## 6.591535e-04 6.928947e-04 6.999353e-04
## AAGAG AGCAGG AAAGG
## 7.261712e-04 7.288115e-04 7.304379e-04
## ACATAG AAGAGG AAAACC
## 7.437793e-04 7.910306e-04 7.939359e-04
## AAAAACAAGGGAGATAT AGATAT AAAGGAAG
## 8.620851e-04 8.624179e-04 9.217655e-04
## ACT AAAG AAAAACC
## 9.223615e-04 9.625643e-04 9.724336e-04
## ACAGC ACCT AAG
## 9.962429e-04 1.057811e-03 1.098304e-03
## ACAGACAGCACAGC ACAGACAGCACAGCACAGC AAAGAG
## 1.129597e-03 1.139886e-03 1.164543e-03
## AACCCT ACACATAT AGAGAT
## 1.194939e-03 1.242575e-03 1.490404e-03
## AT AGGC AGAT
## 1.665801e-03 1.666607e-03 1.690533e-03
## AAAGACAGACAG AAAAAC AAAAATCTTAAAGG
## 1.949065e-03 2.024336e-03 2.199034e-03
# AGC actually has the lowest mutation rate (can I give it a Z score?)
mu <- mean(per_kmer_mu_abs_norm_mean)
sigma <- sd(per_kmer_mu_abs_norm_mean)
(AGC_zscore <- abs(7.584906e-05 - mu)/sigma)
## [1] 1.384379
AGAT <- data_sorted_common[,5]
names(AGAT) <- rownames(data_sorted_common)
obj<-contMap(mouse_tree,AGAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
par(mar = c(2, 2, 2, 2))
plot(per_line_mu_abs_norm_mean) # 3 and 10 are the highest
mean(mu_common_abs_norm)
## [1] 0.0007450784
# 7.45 E-4 - lower than Daphnia, higher than Chlamy
# very similar to daphnia, daphnia is 2.74 E-3.
Not many kmers with a phylogenetic signal
Look for kmers with a phylo signal, using the legit kmers
calc_mutation_rates_abs_random <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
anc_states <- fastAnc(mouse_tree, sample(df_abun[,i]), vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
new_rownames <- gsub("/", "", rownames(legit_data))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
rownames(legit_data) <- new_rownames2
mu_mx_rand_1 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_1 <- calc_norm_mu(legit_data, mu_mx_rand_1)
plot(colMeans(mu_mx_rand_norm_1))
#great, we have a few outliers
sort(colMeans(mu_mx_rand_norm_1), decreasing=T)[1:7]
## AAAGACAGACAG AAAGACAG AGCATGG AAAGAAAGG
## 0.052676699 0.031799341 0.011855337 0.006614550
## AAATGAAT AAGATAGATAGATCT AAAGAAAGAAAGAAAGG
## 0.006336106 0.005061688 0.004908095
which(colnames(legit_data)=="AAAGACAG")
## [1] 104
which(colnames(legit_data)=="AAAGACAGACAG")
## [1] 27
which(colnames(legit_data)=="AGCATGG")
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG")
## [1] 80
which(colnames(legit_data)=="AAATGAAT")
## [1] 230
which(colnames(legit_data)=="AAGATAGATAGATCT")
## [1] 433
which(colnames(legit_data)=="AAAGAAAGAAAGAAAGG")
## [1] 133
AAAGACAG <- legit_data[,104]
names(AAAGACAG) <- rownames(legit_data)
AAAGACAG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 68.345280 57.793219 86.259274 71.408165
## B6JBomTac B6J B10ScSnJ B10SnJ
## 49.622705 94.690465 0.000000 0.000000
## B10ScCr B6NJ B10J B6ByJ
## 0.000000 81.809260 1.901294 51.787538
## B6JEiJ B10ScNHsd
## 62.088058 0.000000
obj<-contMap(mouse_tree,AAAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGACAGACAG <- legit_data[,27]
names(AAAGACAGACAG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGACAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGACAGACAG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 881.309078 698.319519 963.295641 838.750237
## B6JBomTac B6J B10ScSnJ B10SnJ
## 739.659689 1001.591049 5.401604 4.415329
## B10ScCr B6NJ B10J B6ByJ
## 5.880901 919.069355 8.217183 729.856715
## B6JEiJ B10ScNHsd
## 843.105703 3.901369
# yes good one
AGCATGG <- legit_data[,427]
names(AGCATGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AGCATGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AGCATGG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 4.234116 2.470631 2.183164 1.252682
## B6JBomTac B6J B10ScSnJ B10SnJ
## 2.135496 2.137580 15.617626 14.967658
## B10ScCr B6NJ B10J B6ByJ
## 17.962139 3.360821 11.100449 1.949454
## B6JEiJ B10ScNHsd
## 1.188934 9.442596
AAAGAAAGG <- legit_data[,80]
names(AAAGAAAGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGAAAGG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 40.503028 95.985217 10.194145 115.770502
## B6JBomTac B6J B10ScSnJ B10SnJ
## 140.923140 50.373755 26.822014 27.493762
## B10ScCr B6NJ B10J B6ByJ
## 4.731663 202.989221 24.439834 58.854640
## B6JEiJ B10ScNHsd
## 95.534331 24.380243
AAATGAAT <- legit_data[,230]
names(AAATGAAT) <- rownames(legit_data)
# not really, just one pair.
obj<-contMap(mouse_tree,AAATGAAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# no
AAGATAGATAGATCT <- legit_data[,433]
names(AAGATAGATAGATCT) <- rownames(legit_data)
# nothing here
obj<-contMap(mouse_tree,AAGATAGATAGATCT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGAAAGAAAGAAAGG <- legit_data[,133]
names(AAAGAAAGAAAGAAAGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGAAAGAAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# do it again and see if you get any additional
mu_mx_rand_2 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_2 <- calc_norm_mu(legit_data, mu_mx_rand_2)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_2))
mu_mx_rand_3 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_3 <- calc_norm_mu(legit_data, mu_mx_rand_3)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_3))
# nope, no others are having phylo signal
# have 6 kmers with phylogenetic signal
Now complex satellites
complex <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/complex_revised_cn.csv", header=T)
samples <- as.vector(complex$Sample)
complex <- as.matrix(complex[,c(2:3)])
rownames(complex) <- samples
complex
## cn_major cn_minor
## B6JEiJ 1048990.8 123362.6
## B10ScNHsd 1060820.1 135657.5
## B6NHsd 1317473.6 129958.4
## B6NTac 1207882.2 136154.8
## B6NJ 1031331.9 129371.0
## B6NTyrCBrdCrlCrl 1239977.1 137196.5
## B6JBomTac 999881.7 130793.7
## B6J 1118980.8 124911.5
## B10ScCr 1068573.0 128685.2
## B10SnJ 923277.8 129303.0
## B6ByJ 1069595.9 125996.3
## B6NCrl 954977.0 123889.9
## B10ScSnJ 971019.3 133243.0
## B10J 1056396.0 143527.2
# visualize:
range(complex[,1])
## [1] 923277.8 1317473.6
# first, major
obj<-contMap(mouse_tree,complex[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# B6NHsd expansion (1317473), B6NJ loss (1031331.9)
complex[,1]
## B6JEiJ B10ScNHsd B6NHsd B6NTac
## 1048990.8 1060820.1 1317473.6 1207882.2
## B6NJ B6NTyrCBrdCrlCrl B6JBomTac B6J
## 1031331.9 1239977.1 999881.7 1118980.8
## B10ScCr B10SnJ B6ByJ B6NCrl
## 1068573.0 923277.8 1069595.9 954977.0
## B10ScSnJ B10J
## 971019.3 1056396.0
anc_states <- fastAnc(mouse_tree, complex[,1], vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22 23
## 1045425 1009114 1014464 1027204 1045385 1081737 1098129 1109461 1157943
## 24 25 26 27
## 1161155 1147246 1065387 1062584
##
## Variances on ancestral states:
## 15 16 17 18 19 20
## 16995854715 5103246361 4400057772 4106422853 3438407147 4700132074
## 21 22 23 24 25 26
## 3277108126 2784220764 2344618027 1996686318 1609537905 3474444128
## 27
## 2832675811
##
## Lower & upper 95% CIs:
## lower upper
## 15 789903.6 1300947
## 16 869097.3 1149130
## 17 884451.9 1144477
## 18 901604.9 1152804
## 19 930455.1 1160316
## 20 947363.8 1216109
## 21 985926.4 1210331
## 22 1006040.4 1212882
## 23 1063037.1 1252848
## 24 1073573.6 1248736
## 25 1068612.2 1225879
## 26 949856.0 1180918
## 27 958267.5 1166901
#major satellite at ancestor of B6NHsd and B6NJ: 1157943
# gained by B6NHsd:
1317473-1157943 #159,530 copies
## [1] 159530
# lost by B6NJ
1157943-1031331.9 # 126,611 copies lost
## [1] 126611.1
# second, minor
obj<-contMap(mouse_tree,complex[,2],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
complex[,2] # b6NHsd: 129958.4 , B6NJ: 129371
## B6JEiJ B10ScNHsd B6NHsd B6NTac
## 123362.6 135657.5 129958.4 136154.8
## B6NJ B6NTyrCBrdCrlCrl B6JBomTac B6J
## 129371.0 137196.5 130793.7 124911.5
## B10ScCr B10SnJ B6ByJ B6NCrl
## 128685.2 129303.0 125996.3 123889.9
## B10ScSnJ B10J
## 133243.0 143527.2
anc_states <- fastAnc(mouse_tree, complex[,2], vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22
## 130286.5 132589.0 133019.9 133565.6 137054.8 127984.1 128363.0 128873.8
## 23 24 25 26 27
## 130624.7 132347.7 132858.3 126804.8 127297.9
##
## Variances on ancestral states:
## 15 16 17 18 19 20 21 22
## 35461212 10647732 9180555 8567897 7174107 9806649 6837563 5809172
## 23 24 25 26 27
## 4891957 4166011 3358240 7249297 5910272
##
## Lower & upper 95% CIs:
## lower upper
## 15 118614.9 141958.2
## 16 126193.3 138984.6
## 17 127081.2 138958.6
## 18 127828.5 139302.7
## 19 131805.0 142304.5
## 20 121846.3 134122.0
## 21 123237.8 133488.1
## 22 124149.8 133597.9
## 23 126289.6 134959.8
## 24 128347.1 136348.2
## 25 129266.5 136450.1
## 26 121527.6 132082.0
## 27 122532.9 132062.8
# 130624.7
# how many copies and kb did these lines lose and gain?
130624.7 -129958.4 # B6NHsd : lost 666 copies
## [1] 666.3
130624.7 - 129371 # B6NJ : lost 1253 copies
## [1] 1253.7
mu_complex <- calc_mutation_rates(complex)
#mu_complex
mu_complex_norm <- calc_norm_mu(complex, mu_complex)
hypermutator_df$type <- "simple"
head(hypermutator_df)
## line_names kmer_names abun_hypermut type
## 1 B6NTac AC -1.665755e-04 simple
## 2 B6NCrl AC -1.037055e-04 simple
## 3 B6NHsd AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl AC 4.070876e-05 simple
## 5 B6JBomTac AC -1.399663e-04 simple
## 6 B6J AC 4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm)) {
for (i in 1:nrow(mu_complex_norm)) {
abun_complex <- c(abun_complex, mu_complex_norm[i,j])
}
}
line_names <- c(rep(rownames(mu_complex_norm), times = ncol(mu_complex_norm) ))
kmer_names <- c()
for (i in 1:ncol(mu_complex_norm)) {
kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm)[i], times = nrow(mu_complex_norm))))
}
#now make a dataframe
complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
## line_names kmer_names abun_complex type
## 1 B6JEiJ cn_major -0.0000916069 complex
## 2 B10ScNHsd cn_major 0.0001069893 complex
## 3 B6NHsd cn_major 0.0009983401 complex
## 4 B6NTac cn_major 0.0003907011 complex
## 5 B6NJ cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl cn_major 0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")
perline_boxplot_df <- rbind(hypermutator_df, complex_df)
#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) +
geom_boxplot() +
geom_jitter(aes(color=type, alpha=0.6)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
mu_complex_abs <- calc_mutation_rates_abs(complex)
mean(mu_complex_abs[,2])
## [1] 26.05974
mu_complex_abs_norm <- calc_norm_mu(complex, mu_complex_abs)
#mu_complex_abs_norm
colMeans(mu_complex_abs)
## cn_major cn_minor
## 559.13985 26.05974
colMeans(mu_complex_abs_norm)
## cn_major cn_minor
## 0.0005004309 0.0001976812
#major: 4.8 E-4, minor: 1.9 E-4
# major satellite has a higher mutation rate than minor, even after normalizing
# might be because of increased recombination -- more distal to the centromere
library(corrplot)
combined_mu_norm <- cbind(mu_complex_norm, mu_common_norm)
combined_mu_norm[,1:10]
## cn_major cn_minor AC AG
## B6JEiJ -9.160690e-05 -1.615809e-04 -1.665755e-04 4.075430e-04
## B10ScNHsd 1.069893e-04 -7.387808e-05 -1.037055e-04 5.307961e-04
## B6NHsd 9.983401e-04 -3.696561e-05 -1.520792e-04 -1.564207e-03
## B6NTac 3.907011e-04 2.792860e-04 4.070876e-05 1.091716e-03
## B6NJ -1.578695e-03 -4.101223e-04 -1.399663e-04 4.652882e-04
## B6NTyrCBrdCrlCrl 1.262965e-03 5.102066e-04 4.700042e-04 -4.409409e-04
## B6JBomTac -5.087033e-04 2.367385e-04 1.526023e-04 1.838519e-04
## B6J 4.575415e-04 -1.616087e-04 9.140961e-05 4.974917e-05
## B10ScCr 1.728454e-04 -1.568226e-04 2.490011e-04 -7.222850e-04
## B10SnJ -2.893223e-04 -8.429768e-05 4.230222e-04 -5.365049e-04
## B6ByJ -1.255219e-04 -8.906739e-05 -3.731076e-04 3.595034e-04
## B6NCrl -7.866809e-04 -2.184930e-04 -1.704128e-04 9.820323e-05
## B10ScSnJ -1.597972e-04 6.259891e-06 -2.642960e-04 -6.374373e-05
## B10J 7.632285e-05 3.422101e-04 -9.510261e-05 6.792768e-05
## AAG AACCCT AGAT AGAGGC
## B6JEiJ -0.0016936235 -0.0004374261 -0.0036641401 -3.605975e-04
## B10ScNHsd 0.0010510781 -0.0008913244 0.0015920712 -3.000287e-04
## B6NHsd -0.0033459144 -0.0009501471 -0.0043442036 2.057965e-06
## B6NTac 0.0012808578 -0.0011192028 0.0009625905 2.326053e-04
## B6NJ 0.0004670982 -0.0021052905 0.0017003030 -2.519822e-05
## B6NTyrCBrdCrlCrl -0.0009093677 0.0033453313 -0.0017989410 2.586653e-04
## B6JBomTac 0.0005484810 0.0011256591 0.0001359405 2.273234e-04
## B6J 0.0005808924 0.0004389475 0.0012644830 9.134611e-05
## B10ScCr -0.0020978667 0.0001943518 -0.0025471971 2.915746e-04
## B10SnJ 0.0016646539 0.0027878050 0.0037813664 5.069733e-04
## B6ByJ 0.0007397267 -0.0014161398 0.0005660927 -4.350530e-04
## B6NCrl 0.0005409763 0.0001317758 0.0005682900 -1.447509e-04
## B10ScSnJ 0.0002449262 -0.0012163915 -0.0002705450 -2.517623e-04
## B10J 0.0002107909 -0.0005693525 0.0004712985 -1.601563e-04
## AAAG AGG
## B6JEiJ -1.514244e-03 -3.295951e-04
## B10ScNHsd 1.043085e-03 -1.410237e-04
## B6NHsd -2.464712e-03 -6.330540e-04
## B6NTac 1.582728e-03 -4.349942e-05
## B6NJ 8.803666e-04 -3.848845e-04
## B6NTyrCBrdCrlCrl -1.402117e-03 7.395840e-04
## B6JBomTac -1.279014e-06 2.846829e-04
## B6J 4.982097e-04 1.979003e-04
## B10ScCr -1.573901e-03 3.587638e-04
## B10SnJ 5.719602e-04 1.111715e-03
## B6ByJ 4.967773e-04 -9.523429e-05
## B6NCrl 6.199326e-04 -1.910845e-04
## B10ScSnJ 3.594212e-04 -2.978637e-04
## B10J 4.671663e-04 -7.151320e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)
#cor_mu$AAAAACAAGGAGATAT
#cor_mu[,"AAAAACAAGGAGATAT"]
#cor_mu[,30]
# relate copy number changes to sequence variability in the complex satellites
variability <- read.csv(file="~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/variation_centromere_sats.csv", header = T)
variability
## File Line error_rate_major error_rate_minor mu_major
## 1 R104 BL6NTac 0.0356000 0.0448000 453.6645
## 2 R114 BL6NCrl 0.0355000 0.0446000 -872.7919
## 3 R124 BL6NHsd 0.0352000 0.0447000 1156.0206
## 4 R134 BL6NTyrCBrdCrlCrl 0.0354000 0.0448231 1448.9304
## 5 R144 BL6JBomTac 0.0358362 0.0447000 -540.5402
## 6 R2 BL6J 0.0350000 0.0444000 486.1765
## 7 R22 BL10ScSnJ 0.0365000 0.0441000 -162.1085
## 8 R32 BL10SnJ 0.0353000 0.0423000 -291.9592
## 9 R42 BL10ScCr 0.0349000 0.0421000 177.5475
## 10 R52 BL6NJ 0.0352000 0.0443000 -1811.1508
## 11 R62 BL10J 0.0358000 0.0431000 79.7868
## 12 R72 BL6ByJ 0.0355000 0.0447000 -137.8392
## 13 R84 BL6JEiJ 0.0357000 0.0448000 -97.5968
## 14 R94 BL10ScNHsd 0.0357000 0.0431000 111.8450
## mu_minor
## 1 36.9628491
## 2 -28.1580321
## 3 -4.8286223
## 4 67.7851609
## 5 30.1363125
## 6 -20.5724470
## 7 0.8326899
## 8 -11.1769438
## 9 -20.9461098
## 10 -54.4881282
## 11 46.9015225
## 12 -11.4329535
## 13 -20.4892330
## 14 -10.1253442
ggplot(data = variability, aes(x = mu_major, y = error_rate_major)) +
geom_point() +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
ggplot(data = variability, aes(x = mu_minor, y = error_rate_minor)) +
geom_point() +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
Test for phylogenetic signal
data_subset <- data_sorted_common[,5]
names(data_subset) <- rownames(data_sorted_common)
test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
data_subset <- data_sorted_common[,27]
names(data_subset) <- rownames(data_sorted_common)
test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
test_phlosig$P
## [1] 0.001
phylosig_test <- c()
for (i in 1:ncol(legit_data)) {
subset <- legit_data[,i]
names(subset) <- rownames(legit_data)
test_phlosig <- phylosig(mouse_tree, subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
phylosig_test <- c(phylosig_test, test_phlosig$P)
}
possibilities <- which(phylosig_test < 0.05)
filtered_possibilities <- c()
for (i in possibilities) {
prop <- min(legit_data[,i])/max(legit_data[,i])
diff <- max(legit_data[,i]) - min(legit_data[,i])
if (prop < 0.3 & diff > 15){
filtered_possibilities <- c(filtered_possibilities, i)
}
}
filtered_possibilities # this is pretty good. But for some reason 80 isn't showing up
## [1] 27 71 72 104 349 427
# the ones in the table:
which(colnames(legit_data)=="AAAGACAGACAG") #27
## [1] 27
which(colnames(legit_data)=="AAAGACAG") #104
## [1] 104
which(colnames(legit_data)=="AGCATGG") # 427
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG") #80
## [1] 80
which(colnames(legit_data)=="AGGGC") #71
## [1] 71
which(colnames(legit_data)=="AAGAAGAAGAGG") #72
## [1] 72
min(legit_data[,27])
## [1] 3.901369
# check all these
phylosig_test[1:10]
## [1] 0.139 0.645 0.385 0.460 0.636 0.124 0.511 0.167 0.585 0.505
phylosig_test[80] # why not significant? this one looks legit.
## [1] 0.445
colnames(legit_data)[15]
## [1] "ACAG"
check <- legit_data[,15]
names(check) <- rownames(legit_data)
check
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 1623.5399 1521.7861 1250.1318 2013.6151
## B6JBomTac B6J B10ScSnJ B10SnJ
## 1680.3956 1627.5711 1148.3129 1168.5650
## B10ScCr B6NJ B10J B6ByJ
## 685.9261 1637.6058 1138.5821 1530.7700
## B6JEiJ B10ScNHsd
## 1546.1709 1123.8266
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[71]
## [1] "AGGGC"
check_2 <- legit_data[,71]
names(check_2) <- rownames(legit_data)
check_2
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 56.22773 77.83892 82.72548 86.91108
## B6JBomTac B6J B10ScSnJ B10SnJ
## 51.54837 53.20102 84.08393 189.24518
## B10ScCr B6NJ B10J B6ByJ
## 107.44587 88.20542 99.42593 64.28542
## B6JEiJ B10ScNHsd
## 57.50664 86.89554
obj<-contMap(mouse_tree,check_2,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[72]
## [1] "AAGAAGAAGAGG"
check_3 <- legit_data[,72]
names(check_3) <- rownames(legit_data)
obj<-contMap(mouse_tree,check_3,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[80]
## [1] "AAAGAAAGG"
check <- legit_data[,80]
names(check) <- rownames(legit_data)
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
#check
# want to apply some sort of filtering - requiring a difference of x%?
colnames(legit_data)[349]
## [1] "AAAGAAAGGAAGGAAGG"
check <- legit_data[,349]
names(check) <- rownames(legit_data)
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# this one is good
check
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 7.6277800 9.2428867 2.3568429 8.5065947
## B6JBomTac B6J B10ScSnJ B10SnJ
## 13.2646907 17.5286549 9.6558086 11.3011399
## B10ScCr B6NJ B10J B6ByJ
## 2.5862785 9.5971337 8.6801232 0.1576554
## B6JEiJ B10ScNHsd
## 14.2165408 9.8961555
Now, look at the Ymin satellite
Ymin <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/Ymin_cn.csv", header=T)
samples <- as.vector(Ymin$Sample)
Ymin <- as.matrix(Ymin$cn_HOR_Ymin)
rownames(Ymin) <- samples
Ymin
## [,1]
## B6JEiJ 28.63662
## B10ScNHsd 28.77115
## B6NHsd 21.89659
## B6NTac 28.92109
## B6NJ 20.58688
## B6NTyrCBrdCrlCrl 30.30322
## B6JBomTac 29.80160
## B6J 26.39369
## B10ScCr 20.03651
## B10SnJ 26.36346
## B6ByJ 27.44637
## B6NCrl 28.15001
## B10ScSnJ 25.73320
## B10J 28.53049
colnames(Ymin) <- "cn_HOR_Ymin"
# visualize:
range(Ymin[,1])
## [1] 20.03651 30.30322
# first, major
obj<-contMap(mouse_tree,Ymin[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
anc_states <- fastAnc(mouse_tree, Ymin[,1], vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22
## 26.44703 25.73685 25.63808 25.49412 27.32168 27.15721 26.81652 26.57562
## 23 24 25 26 27
## 25.92030 26.34371 25.85353 27.87069 27.97749
##
## Variances on ancestral states:
## 15 16 17 18 19 20 21
## 23.326382 7.004077 6.038968 5.635962 4.719127 6.450813 4.497748
## 22 23 24 25 26 27
## 3.821273 3.217929 2.740402 2.209050 4.768587 3.887776
##
## Lower & upper 95% CIs:
## lower upper
## 15 16.98074 35.91332
## 16 20.54967 30.92403
## 17 20.82152 30.45465
## 18 20.84104 30.14719
## 19 23.06386 31.57950
## 20 22.17911 32.13530
## 21 22.65978 30.97327
## 22 22.74420 30.40705
## 23 22.40433 29.43626
## 24 23.09909 29.58832
## 25 22.94041 28.76665
## 26 23.59062 32.15076
## 27 24.11287 31.84211
mu_Ymin <- calc_mutation_rates(Ymin)
#mu_complex
mu_Ymin
## cn_HOR_Ymin
## B6JEiJ 0.0045591078
## B10ScNHsd 0.0105033902
## B6NHsd -0.0291572766
## B6NTac 0.0250231070
## B6NJ -0.0822913756
## B6NTyrCBrdCrlCrl 0.0695263271
## B6JBomTac 0.0157250869
## B6J -0.0136534962
## B10ScCr -0.0234232196
## B10SnJ 0.0021313171
## B6ByJ 0.0030427341
## B6NCrl 0.0088948335
## B10ScSnJ 0.0003549335
## B10J 0.0087594744
mean(mu_Ymin)
## [1] -3.611772e-07
mu_Ymin_norm <- calc_norm_mu(Ymin, mu_Ymin)
mu_Ymin_norm
## cn_HOR_Ymin
## B6JEiJ 1.635807e-04
## B10ScNHsd 3.844343e-04
## B6NHsd -1.124882e-03
## B6NTac 9.498703e-04
## B6NJ -3.182984e-03
## B6NTyrCBrdCrlCrl 2.689239e-03
## B6JBomTac 5.620620e-04
## B6J -4.880171e-04
## B10ScCr -9.187696e-04
## B10SnJ 8.281188e-05
## B6ByJ 1.134649e-04
## B6NCrl 3.346990e-04
## B10ScSnJ 1.384399e-05
## B10J 3.206053e-04
# absolute mutation rate
mu_Ymin_abs <- calc_mutation_rates_abs(Ymin)
mean(mu_Ymin_abs)
## [1] 0.02121755
range(mu_Ymin_abs)
## [1] 0.0003549335 0.0822913756
mu_Ymin_abs_norm <- calc_norm_mu(Ymin, mu_Ymin_abs)
mean(mu_Ymin_abs_norm)
## [1] 0.0008092332
mu_complex_norm_all <- cbind(mu_complex_norm, mu_Ymin_norm)
hypermutator_df$type <- "simple"
head(hypermutator_df)
## line_names kmer_names abun type
## 1 B6NTac AC -1.665755e-04 simple
## 2 B6NCrl AC -1.037055e-04 simple
## 3 B6NHsd AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl AC 4.070876e-05 simple
## 5 B6JBomTac AC -1.399663e-04 simple
## 6 B6J AC 4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm_all)) {
for (i in 1:nrow(mu_complex_norm_all)) {
abun_complex <- c(abun_complex, mu_complex_norm_all[i,j])
}
}
line_names <- c(rep(rownames(mu_complex_norm_all), times = ncol(mu_complex_norm_all) ))
kmer_names <- c()
for (i in 1:ncol(mu_complex_norm_all)) {
kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm_all)[i], times = nrow(mu_complex_norm_all))))
}
#now make a dataframe
complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
## line_names kmer_names abun_complex type
## 1 B6JEiJ cn_major -0.0000916069 complex
## 2 B10ScNHsd cn_major 0.0001069893 complex
## 3 B6NHsd cn_major 0.0009983401 complex
## 4 B6NTac cn_major 0.0003907011 complex
## 5 B6NJ cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl cn_major 0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")
perline_boxplot_df <- rbind(hypermutator_df, complex_df)
#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) +
geom_boxplot() +
geom_jitter(aes(color=type, alpha=0.6)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
library(corrplot)
mu_complex_norm2 <- cbind(mu_complex_norm, mu_Ymin_norm)
combined_mu_norm <- cbind(mu_complex_norm2, mu_common_norm)
combined_mu_norm[,1:10]
## cn_major cn_minor cn_HOR_Ymin AC
## B6JEiJ -9.160690e-05 -1.615809e-04 1.635807e-04 -1.665755e-04
## B10ScNHsd 1.069893e-04 -7.387808e-05 3.844343e-04 -1.037055e-04
## B6NHsd 9.983401e-04 -3.696561e-05 -1.124882e-03 -1.520792e-04
## B6NTac 3.907011e-04 2.792860e-04 9.498703e-04 4.070876e-05
## B6NJ -1.578695e-03 -4.101223e-04 -3.182984e-03 -1.399663e-04
## B6NTyrCBrdCrlCrl 1.262965e-03 5.102066e-04 2.689239e-03 4.700042e-04
## B6JBomTac -5.087033e-04 2.367385e-04 5.620620e-04 1.526023e-04
## B6J 4.575415e-04 -1.616087e-04 -4.880171e-04 9.140961e-05
## B10ScCr 1.728454e-04 -1.568226e-04 -9.187696e-04 2.490011e-04
## B10SnJ -2.893223e-04 -8.429768e-05 8.281188e-05 4.230222e-04
## B6ByJ -1.255219e-04 -8.906739e-05 1.134649e-04 -3.731076e-04
## B6NCrl -7.866809e-04 -2.184930e-04 3.346990e-04 -1.704128e-04
## B10ScSnJ -1.597972e-04 6.259891e-06 1.384399e-05 -2.642960e-04
## B10J 7.632285e-05 3.422101e-04 3.206053e-04 -9.510261e-05
## AG AAG AACCCT AGAT
## B6JEiJ 4.075430e-04 -0.0016936235 -0.0004374261 -0.0036641401
## B10ScNHsd 5.307961e-04 0.0010510781 -0.0008913244 0.0015920712
## B6NHsd -1.564207e-03 -0.0033459144 -0.0009501471 -0.0043442036
## B6NTac 1.091716e-03 0.0012808578 -0.0011192028 0.0009625905
## B6NJ 4.652882e-04 0.0004670982 -0.0021052905 0.0017003030
## B6NTyrCBrdCrlCrl -4.409409e-04 -0.0009093677 0.0033453313 -0.0017989410
## B6JBomTac 1.838519e-04 0.0005484810 0.0011256591 0.0001359405
## B6J 4.974917e-05 0.0005808924 0.0004389475 0.0012644830
## B10ScCr -7.222850e-04 -0.0020978667 0.0001943518 -0.0025471971
## B10SnJ -5.365049e-04 0.0016646539 0.0027878050 0.0037813664
## B6ByJ 3.595034e-04 0.0007397267 -0.0014161398 0.0005660927
## B6NCrl 9.820323e-05 0.0005409763 0.0001317758 0.0005682900
## B10ScSnJ -6.374373e-05 0.0002449262 -0.0012163915 -0.0002705450
## B10J 6.792768e-05 0.0002107909 -0.0005693525 0.0004712985
## AGAGGC AAAG
## B6JEiJ -3.605975e-04 -1.514244e-03
## B10ScNHsd -3.000287e-04 1.043085e-03
## B6NHsd 2.057965e-06 -2.464712e-03
## B6NTac 2.326053e-04 1.582728e-03
## B6NJ -2.519822e-05 8.803666e-04
## B6NTyrCBrdCrlCrl 2.586653e-04 -1.402117e-03
## B6JBomTac 2.273234e-04 -1.279014e-06
## B6J 9.134611e-05 4.982097e-04
## B10ScCr 2.915746e-04 -1.573901e-03
## B10SnJ 5.069733e-04 5.719602e-04
## B6ByJ -4.350530e-04 4.967773e-04
## B6NCrl -1.447509e-04 6.199326e-04
## B10ScSnJ -2.517623e-04 3.594212e-04
## B10J -1.601563e-04 4.671663e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)